{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.append(os.path.abspath('../../code'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import regex\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from utils.io import read_label_config\n",
    "from utils.corpus import DoccanoAnnotationsCorpus\n",
    "\n",
    "from collections import Counter\n",
    "\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "np.set_printoptions(formatter={'float': lambda x: \"{0:0.3f}\".format(x)})\n",
    "pd.set_option('display.max_columns', 15)\n",
    "pd.set_option('display.width', 320)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from types import SimpleNamespace\n",
    "\n",
    "args = SimpleNamespace()\n",
    "args.data_path = '../../data/annotation/annotations/'\n",
    "args.data_folder_pattern = 'uk-manifestos'\n",
    "args.data_annotations_folder = 'annotations'\n",
    "args.data_file_format = 'jsonl'\n",
    "args.keep_annotator = 'emarie,sjasmin'\n",
    "\n",
    "args.label_config_file = '../../data/annotation/doccano_label_config.json'\n",
    "\n",
    "args.output_file = '../../data/annotation/parsed/uk-manifestos_annotations.jsonl'\n",
    "args.overwrite_output = False\n",
    "\n",
    "args.verbose = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "subdirs = [os.path.join(args.data_path, d, args.data_annotations_folder) for d in os.listdir(args.data_path) if d.startswith(args.data_folder_pattern]\n",
    "annotators = [a.strip() for a in args.keep_annotator.split(',')]\n",
    "fps = [os.path.join(d, a+'.'+args.data_file_format) for a in annotators for d in subdirs]\n",
    "fps.sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the label config\n",
    "cat2code = read_label_config(args.label_config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read first (we merge the rest to this one)\n",
    "acorp = DoccanoAnnotationsCorpus(cat2code)\n",
    "annotator = os.path.basename(fps[0]).replace('.jsonl', '')\n",
    "acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-other-parties/annotations/sjasmin.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-01/annotations/emarie.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-01/annotations/sjasmin.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-02/annotations/emarie.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-02/annotations/sjasmin.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-03/annotations/emarie.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-03/annotations/sjasmin.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-2017+19/annotations/emarie.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-2017+19/annotations/sjasmin.jsonl'\n",
      "Warning: annotation of token 20 in document 2f6771e692f900c77477e074d4d6deb9 disambiguated\n"
     ]
    }
   ],
   "source": [
    "# merge remaining ones\n",
    "for fp in fps[1:]:\n",
    "    if args.verbose: print(f'Reading annotations from file \\'{fp}\\'')\n",
    "    tmp = DoccanoAnnotationsCorpus(cat2code)\n",
    "    annotator = os.path.basename(fp).replace('.jsonl', '')\n",
    "    tmp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)\n",
    "    acorp.merge_annotated_corpus(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m2f6771e692f900c77477e074d4d6deb9\u001b[0m\n",
      "'Only Labour will offer the choice of remaining in the EU , or leaving with a sensible deal .'\n",
      "      \u001b[44m      \u001b[49m                                       \u001b[44m   \u001b[49m \u001b[44m  \u001b[49m                                    \t(sjasmin)\n"
     ]
    }
   ],
   "source": [
    "# inspect those where warnings raised\n",
    "print(acorp.docs[acorp.doc_id2idx['2f6771e692f900c77477e074d4d6deb9']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### merge gold labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp = os.path.join(args.data_path, 'ra-annotation-uk-manifestos-review', 'annotations', 'all.jsonl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the label config\n",
    "cats = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']\n",
    "cats = [t+c for t in ['I-', 'B-'] for c in cats]\n",
    "cat2code_gold = {l+a: c+1 for a in ['-a', '-z'] for c, l in enumerate(cats)}\n",
    "cat2code_gold['O'] = acorp.outside_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=len(cats))\n",
    "gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check that doc IDs in gold data match those in the annotations\n",
    "all([doc_id in acorp.doc_ids for doc_id in gold_corp.doc_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge the gold labels to the annotations corpus\n",
    "acorp.merge_gold_corpus(gold_corp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m30426f95b0aeaf60be5831dce2e28490\u001b[0m\n",
      "'It also means that parents , mainly mothers , who take a full year are also losing out on pension provision .'\n",
      "                    \u001b[44m       \u001b[49m \u001b[44m \u001b[49m \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m \u001b[44m \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m                                           \t(emarie)\n",
      "                    \u001b[44m       \u001b[49m          \u001b[44m       \u001b[49m                                                                  \t(sjasmin)\n",
      "                    \u001b[42m       \u001b[49m \u001b[42m \u001b[49m \u001b[42m      \u001b[49m \u001b[42m       \u001b[49m \u001b[42m \u001b[49m \u001b[42m   \u001b[49m \u001b[42m    \u001b[49m \u001b[42m \u001b[49m \u001b[42m    \u001b[49m \u001b[42m    \u001b[49m                                           \t[GOLD]\n"
     ]
    }
   ],
   "source": [
    "print(acorp.docs[acorp.doc_id2idx[gold_corp.doc_ids[99]]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No. docs: 8596\n",
      "(array([1, 2]), array([5609, 2987]))\n"
     ]
    }
   ],
   "source": [
    "print('No. docs:', acorp.ndocs)\n",
    "# how many singly/multiply annotated?\n",
    "print(np.unique(np.asarray([doc.n_annotations for doc in acorp.docs]), return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# identify duplicate texts (if any)\n",
    "texts = Counter()\n",
    "for doc in acorp.docs:\n",
    "    texts.update([doc.text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1, 2]), array([8556,   20]))\n"
     ]
    }
   ],
   "source": [
    "print(np.unique(np.asarray(list(texts.values())), return_counts=True))\n",
    "# 20 sentences are verbatim duplicates (possible because we sampled based on within-manifesto sentence IDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get IDs of documents with dublicated text\n",
    "duplicated = [t for t, n in texts.most_common() if n > 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# map doc IDs to texts\n",
    "duplicates_ids = dict()\n",
    "for doc in acorp.docs:\n",
    "    if doc.text in duplicated:\n",
    "        if doc.text in duplicates_ids.keys():\n",
    "            duplicates_ids[doc.text].append(doc.id)\n",
    "        else:\n",
    "            duplicates_ids[doc.text] = [doc.id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m0ccd795eb82b198ee3ef7ac9712ebc13\u001b[0m\n",
      "'Move towards introducing ‘ safe standing ’ at football clubs , requiring the Sports Grounds Safety Authority to prepare guidance for implementing this change .'\n",
      "                                               \u001b[44m        \u001b[49m \u001b[44m     \u001b[49m                 \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m \u001b[44m         \u001b[49m                                                   \t(emarie)\n",
      "                                                                          \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m \u001b[44m         \u001b[49m                                                   \t(sjasmin)\n",
      "                                               \u001b[42m        \u001b[49m \u001b[42m     \u001b[49m             \u001b[42m   \u001b[49m \u001b[42m      \u001b[49m \u001b[42m       \u001b[49m \u001b[42m      \u001b[49m \u001b[42m         \u001b[49m                                                   \t[GOLD]\n",
      "\u001b[1m3b2a4eff3fabc533869d3780d8536fbe\u001b[0m\n",
      "'Move towards introducing ‘ safe standing ’ at football clubs , requiring the Sports Grounds Safety Authority to prepare guidance for implementing this change .'\n",
      "                                               \u001b[44m        \u001b[49m \u001b[44m     \u001b[49m             \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m \u001b[44m      \u001b[49m \u001b[44m         \u001b[49m                                                   \t(emarie)\n",
      "                                                                                                                                                                \t(sjasmin)\n",
      "                                               \u001b[42m        \u001b[49m \u001b[42m     \u001b[49m             \u001b[42m   \u001b[49m \u001b[42m      \u001b[49m \u001b[42m       \u001b[49m \u001b[42m      \u001b[49m \u001b[42m         \u001b[49m                                                   \t[GOLD]\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m187dd16d75689610f9174603057b3bfa\u001b[0m\n",
      "'The nations of the United Kingdom have long had different needs with regard to funding .'\n",
      "                                                                                         \t(emarie)\n",
      "                                                                                         \t(sjasmin)\n",
      "\u001b[1ma4f174c7dca71c65158db52f7b0e71bc\u001b[0m\n",
      "'The nations of the United Kingdom have long had different needs with regard to funding .'\n",
      "                                                                                         \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m8727567c76bf2b8fbe1b7f2787ec3276\u001b[0m\n",
      "'Protect the independence of the BBC and set up a BBC Licence Fee Commission , maintain Channel 4 in public ownership and protect the funding and editorial independence of Welsh language broadcasters .'\n",
      "                             \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m              \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m          \u001b[49m            \u001b[44m       \u001b[49m \u001b[44m \u001b[49m                                                                           \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m \u001b[44m            \u001b[49m  \t(emarie)\n",
      "                             \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m              \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m          \u001b[49m            \u001b[44m       \u001b[49m \u001b[44m \u001b[49m                                                                           \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m \u001b[44m            \u001b[49m  \t(sjasmin)\n",
      "                             \u001b[42m   \u001b[49m \u001b[42m   \u001b[49m              \u001b[42m   \u001b[49m \u001b[42m       \u001b[49m \u001b[42m   \u001b[49m \u001b[42m          \u001b[49m            \u001b[42m       \u001b[49m \u001b[42m \u001b[49m                                                                           \u001b[42m     \u001b[49m \u001b[42m        \u001b[49m \u001b[42m            \u001b[49m  \t[GOLD]\n",
      "\u001b[1m7c06e3a0984fee5d24e280a23e38c073\u001b[0m\n",
      "'Protect the independence of the BBC and set up a BBC Licence Fee Commission , maintain Channel 4 in public ownership and protect the funding and editorial independence of Welsh language broadcasters .'\n",
      "                             \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m              \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m          \u001b[49m            \u001b[44m       \u001b[49m \u001b[44m \u001b[49m                                                                           \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m \u001b[44m            \u001b[49m  \t(emarie)\n",
      "                             \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m              \u001b[44m   \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m          \u001b[49m            \u001b[44m       \u001b[49m \u001b[44m \u001b[49m                                                                           \u001b[44m     \u001b[49m \u001b[44m        \u001b[49m \u001b[44m            \u001b[49m  \t(sjasmin)\n",
      "                             \u001b[42m   \u001b[49m \u001b[42m   \u001b[49m              \u001b[42m   \u001b[49m \u001b[42m       \u001b[49m \u001b[42m   \u001b[49m \u001b[42m          \u001b[49m            \u001b[42m       \u001b[49m \u001b[42m \u001b[49m                                                                           \u001b[42m     \u001b[49m \u001b[42m        \u001b[49m \u001b[42m            \u001b[49m  \t[GOLD]\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mb8fcbc767a1d9c51ff8b79a45c46bcb8\u001b[0m\n",
      "'When it came to reforming the House of Lords and giving citizens a stronger voice with fair votes , our proposals were blocked .'\n",
      "                           \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m            \u001b[44m        \u001b[49m                                                                \t(emarie)\n",
      "                           \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m            \u001b[44m        \u001b[49m                                                                \t(sjasmin)\n",
      "\u001b[1m760b6392c10197cada53339542796332\u001b[0m\n",
      "'When it came to reforming the House of Lords and giving citizens a stronger voice with fair votes , our proposals were blocked .'\n",
      "                           \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m            \u001b[44m        \u001b[49m                                                                \t(emarie)\n",
      "                           \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m  \u001b[49m \u001b[44m     \u001b[49m                                                                                    \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1md46e090545cb1620fe4423f9dee86f0b\u001b[0m\n",
      "'Our democratic institutions should be representative of and accountable to the people they represent .'\n",
      "     \u001b[44m          \u001b[49m \u001b[44m            \u001b[49m                                                \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m    \u001b[49m \u001b[44m         \u001b[49m  \t(emarie)\n",
      "                                                                            \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m    \u001b[49m \u001b[44m         \u001b[49m  \t(sjasmin)\n",
      "     \u001b[42m          \u001b[49m \u001b[42m            \u001b[49m                                                \u001b[42m   \u001b[49m \u001b[42m      \u001b[49m \u001b[42m    \u001b[49m \u001b[42m         \u001b[49m  \t[GOLD]\n",
      "\u001b[1m82487a1461b4d010a0750dbeaf6b1aab\u001b[0m\n",
      "'Our democratic institutions should be representative of and accountable to the people they represent .'\n",
      "                                                                            \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m    \u001b[49m \u001b[44m         \u001b[49m  \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m8e49b057eda7912d48f74f7581ed7fb3\u001b[0m\n",
      "'We will demand , firstly , that the proposals of the Smith Commission are delivered quickly and in full .'\n",
      "                                                  \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m                                    \t(emarie)\n",
      "                                                  \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m                                    \t(sjasmin)\n",
      "\u001b[1m16a0f7ef12024b2e719954d87f6969a6\u001b[0m\n",
      "'We will demand , firstly , that the proposals of the Smith Commission are delivered quickly and in full .'\n",
      "                                                  \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m                                    \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m54c25996cbee486c56a6b0fdfd2f7c79\u001b[0m\n",
      "'Liberal Democrats wish to see a permanently peaceful , stable , non-sectarian and truly democratic society in Northern Ireland .'\n",
      " \u001b[44m       \u001b[49m \u001b[44m         \u001b[49m               \u001b[44m           \u001b[49m \u001b[44m        \u001b[49m \u001b[44m \u001b[49m \u001b[44m      \u001b[49m \u001b[44m \u001b[49m \u001b[44m             \u001b[49m \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m \u001b[44m       \u001b[49m                      \t(emarie)\n",
      " \u001b[44m       \u001b[49m \u001b[44m         \u001b[49m                                                                                                               \t(sjasmin)\n",
      " \u001b[42m       \u001b[49m \u001b[42m         \u001b[49m               \u001b[42m           \u001b[49m \u001b[42m        \u001b[49m \u001b[42m \u001b[49m \u001b[42m      \u001b[49m \u001b[42m \u001b[49m \u001b[42m             \u001b[49m \u001b[42m   \u001b[49m \u001b[42m     \u001b[49m \u001b[42m          \u001b[49m \u001b[42m       \u001b[49m \u001b[42m  \u001b[49m \u001b[42m        \u001b[49m \u001b[42m       \u001b[49m  \t[GOLD]\n",
      "\u001b[1m2a89d4f048351fe43b0102ccdab00859\u001b[0m\n",
      "'Liberal Democrats wish to see a permanently peaceful , stable , non-sectarian and truly democratic society in Northern Ireland .'\n",
      " \u001b[44m       \u001b[49m \u001b[44m         \u001b[49m               \u001b[44m           \u001b[49m \u001b[44m        \u001b[49m \u001b[44m \u001b[49m \u001b[44m      \u001b[49m \u001b[44m \u001b[49m \u001b[44m             \u001b[49m \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m        \u001b[49m \u001b[44m       \u001b[49m  \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m4239035cc4586cebff92bd7ee5f3acda\u001b[0m\n",
      "'SNP MPs will build a cross-party coalition to scrap Trident as quickly and safely as possible .'\n",
      " \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m                                                                                        \t(sjasmin)\n",
      "\u001b[1md3bbe08694332a5923fddaaec541ba30\u001b[0m\n",
      "'SNP MPs will build a cross-party coalition to scrap Trident as quickly and safely as possible .'\n",
      " \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m                                                                                        \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mec49785ff7550ab699a28fa560cc7178\u001b[0m\n",
      "'We will raise productivity growth in the public sector in order to deliver better schools and a better NHS .'\n",
      "                                      \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m      \u001b[49m                     \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m              \u001b[44m   \u001b[49m  \t(emarie)\n",
      "                                      \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m      \u001b[49m                            \u001b[44m       \u001b[49m              \u001b[44m   \u001b[49m  \t(sjasmin)\n",
      "                                      \u001b[42m   \u001b[49m \u001b[42m      \u001b[49m \u001b[42m      \u001b[49m                     \u001b[42m      \u001b[49m \u001b[42m       \u001b[49m              \u001b[42m   \u001b[49m  \t[GOLD]\n",
      "\u001b[1m3e3eef0abdbfd4ff0301995486b06ae5\u001b[0m\n",
      "'We will raise productivity growth in the public sector in order to deliver better schools and a better NHS .'\n",
      "                                      \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m      \u001b[49m                            \u001b[44m       \u001b[49m              \u001b[44m   \u001b[49m  \t(emarie)\n",
      "                                      \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m \u001b[44m      \u001b[49m                            \u001b[44m       \u001b[49m              \u001b[44m   \u001b[49m  \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m3d30f73ceaecd66835bf90e0970948cd\u001b[0m\n",
      "'We will reduce youth unemployment and reduce the number of children in workless households as part of our strategy for tackling poverty and inequality .'\n",
      "                                                            \u001b[44m        \u001b[49m \u001b[44m  \u001b[49m \u001b[44m        \u001b[49m \u001b[44m          \u001b[49m                                                              \t(emarie)\n",
      "                                                            \u001b[44m        \u001b[49m \u001b[44m  \u001b[49m \u001b[44m        \u001b[49m \u001b[44m          \u001b[49m                                                              \t(sjasmin)\n",
      "\u001b[1mc0055bf852b96fcfcaaa6b51556ade5f\u001b[0m\n",
      "'We will reduce youth unemployment and reduce the number of children in workless households as part of our strategy for tackling poverty and inequality .'\n",
      "                \u001b[44m     \u001b[49m                                       \u001b[44m        \u001b[49m \u001b[44m  \u001b[49m \u001b[44m        \u001b[49m \u001b[44m          \u001b[49m                                                              \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m4fa46261a4c9f8dd65cb01f6688ad784\u001b[0m\n",
      "'We will improve Britain ' s international rankings for tax competitiveness and business regulation .'\n",
      "                                                                                                     \t(emarie)\n",
      "                                                                                                     \t(sjasmin)\n",
      "\u001b[1m000634a1137e300ca28d80e5cf7bd858\u001b[0m\n",
      "'We will improve Britain ' s international rankings for tax competitiveness and business regulation .'\n",
      "                                                                                                     \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m4621b4397e3647f52b2d0eeb0eeda116\u001b[0m\n",
      "'Expand tough ' Community Payback ' for criminals who don ' t go to prison , giving everyone the right to vote on the work they do .'\n",
      "                                        \u001b[44m         \u001b[49m \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m \u001b[44m \u001b[49m \u001b[44m \u001b[49m \u001b[44m  \u001b[49m \u001b[44m  \u001b[49m \u001b[44m      \u001b[49m          \u001b[44m        \u001b[49m                                        \t(emarie)\n",
      "                                        \u001b[44m         \u001b[49m \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m \u001b[44m \u001b[49m \u001b[44m \u001b[49m \u001b[44m  \u001b[49m \u001b[44m  \u001b[49m \u001b[44m      \u001b[49m                                                          \t(sjasmin)\n",
      "\u001b[1mb44f81b945d37ac6a23fedd5b3e28395\u001b[0m\n",
      "'Expand tough ' Community Payback ' for criminals who don ' t go to prison , giving everyone the right to vote on the work they do .'\n",
      "                                        \u001b[44m         \u001b[49m \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m \u001b[44m \u001b[49m \u001b[44m \u001b[49m \u001b[44m  \u001b[49m \u001b[44m  \u001b[49m \u001b[44m      \u001b[49m                                                          \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m9793c4a5aad5f1c86f480756458f346e\u001b[0m\n",
      "'The growing display of indecent material in public places gives offence to many people .'\n",
      "                                                                                 \u001b[44m      \u001b[49m  \t(emarie)\n",
      "\u001b[1m5e4b1c44d3501866e3d4cc231dacb041\u001b[0m\n",
      "'The growing display of indecent material in public places gives offence to many people .'\n",
      "                                                                                 \u001b[44m      \u001b[49m  \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1md1db262b89492a017891815293cd6628\u001b[0m\n",
      "'There can be no military solution to this conflict and all sides must avoid taking action that would make peace harder to achieve .'\n",
      "                 \u001b[44m        \u001b[49m                                                                                                           \t(emarie)\n",
      "\u001b[1m18d2ae18c11a2753abdf0e674d93ded3\u001b[0m\n",
      "'There can be no military solution to this conflict and all sides must avoid taking action that would make peace harder to achieve .'\n",
      "                                                                                                                                    \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m6c640ced0035d202fe3afacd54aa382c\u001b[0m\n",
      "'Create UK Finance for Growth , bringing £4 billion together to provide capital for growing businesses , investing in the growth sectors of the future .'\n",
      "        \u001b[44m  \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m                                                       \u001b[44m       \u001b[49m \u001b[44m          \u001b[49m                                                  \t(emarie)\n",
      "\u001b[1m1646d027ed106b122765cc27862dcb08\u001b[0m\n",
      "'Create UK Finance for Growth , bringing £4 billion together to provide capital for growing businesses , investing in the growth sectors of the future .'\n",
      "        \u001b[44m  \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m      \u001b[49m                                                       \u001b[44m       \u001b[49m \u001b[44m          \u001b[49m                                                  \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m01888a6080b732f44ef912714d0105e6\u001b[0m\n",
      "'Legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'\n",
      "                    \u001b[44m      \u001b[49m \u001b[44m            \u001b[49m                                                                                           \t(emarie)\n",
      "                    \u001b[44m      \u001b[49m \u001b[44m            \u001b[49m                                                                                           \t(sjasmin)\n",
      "\u001b[1m1584822d4842da1925f3865d1191ceca\u001b[0m\n",
      "'Legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'\n",
      "                    \u001b[44m      \u001b[49m \u001b[44m            \u001b[49m                                                                                           \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m385887b3450cf1896c652b0b449fd656\u001b[0m\n",
      "'Stronger local government , with increased local democratic scrutiny over all local public services .'\n",
      "          \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m                                                     \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m \u001b[44m        \u001b[49m  \t(emarie)\n",
      "\u001b[1m9d17a2c048680f24d8e2d4d2c0720ce9\u001b[0m\n",
      "'Stronger local government , with increased local democratic scrutiny over all local public services .'\n",
      "          \u001b[44m     \u001b[49m \u001b[44m          \u001b[49m                                                     \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m \u001b[44m        \u001b[49m  \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1mf822977ed8aa182ee00097da6777496e\u001b[0m\n",
      "'We will safeguard Britain ' s credit rating with a credible plan to eliminate the bulk of the structural deficit over a Parliament .'\n",
      "                                                                                                                                     \t(sjasmin)\n",
      "\u001b[1m8985fd4074356709a83361871251b493\u001b[0m\n",
      "'We will safeguard Britain ' s credit rating with a credible plan to eliminate the bulk of the structural deficit over a Parliament .'\n",
      "                                                                                                                         \u001b[44m          \u001b[49m  \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m4fc7978444ea6e15e32546810c327b42\u001b[0m\n",
      "'Labour will continue to press for an immediate return to meaningful negotiations leading to a diplomatic resolution .'\n",
      " \u001b[44m      \u001b[49m                                                                                                               \t(sjasmin)\n",
      "\u001b[1m3bb7cadf4a27352343942da4d7105bba\u001b[0m\n",
      "'Labour will continue to press for an immediate return to meaningful negotiations leading to a diplomatic resolution .'\n",
      " \u001b[44m      \u001b[49m                                                                                                               \t(emarie)\n",
      " \u001b[44m      \u001b[49m                                                                                                               \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m80a20e4ca558c46e213f0ca0dc510503\u001b[0m\n",
      "'We will reintroduce library standards so that government can assess and guide councils in delivering the best possible service .'\n",
      "                                               \u001b[44m          \u001b[49m                      \u001b[44m        \u001b[49m                                          \t(emarie)\n",
      "\u001b[1m641a442625e212061d547e542e27cde2\u001b[0m\n",
      "'We will reintroduce library standards so that government can assess and guide councils in delivering the best possible service .'\n",
      "                                               \u001b[44m          \u001b[49m                      \u001b[44m        \u001b[49m                                          \t(sjasmin)\n"
     ]
    }
   ],
   "source": [
    "# show every group of docs that share the same text, one separator line per group\n",
    "if args.verbose:\n",
    "    rule = '-'*100\n",
    "    for dup_ids in duplicates_ids.values():\n",
    "        print('\\n' + rule)\n",
    "        for doc_id in dup_ids:\n",
    "            print(acorp.docs[acorp.doc_id2idx[doc_id]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: I've manually checked the cases where these very duplicate annotations.\n",
    "#  In most cases, the annotations from the same annotator for the same text (though diff. 'docs') are identical.>\n",
    "#  But in the few cases where this does not hold, I manually disambiguate.\n",
    "disambigute_duplicates = {\n",
    "    'b8fcbc767a1d9c51ff8b79a45c46bcb8': ['b8fcbc767a1d9c51ff8b79a45c46bcb8', '760b6392c10197cada53339542796332'],\n",
    "    '54c25996cbee486c56a6b0fdfd2f7c79': ['54c25996cbee486c56a6b0fdfd2f7c79', '2a89d4f048351fe43b0102ccdab00859'],\n",
    "    'ec49785ff7550ab699a28fa560cc7178': ['ec49785ff7550ab699a28fa560cc7178', '3e3eef0abdbfd4ff0301995486b06ae5'],\n",
    "    '3d30f73ceaecd66835bf90e0970948cd': ['3d30f73ceaecd66835bf90e0970948cd', 'c0055bf852b96fcfcaaa6b51556ade5f']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# resolve duplicates: keep exactly one doc per duplicated text\n",
    "for ids in duplicates_ids.values():\n",
    "    # Manually disambiguated group? Look for the entry whose pair covers all IDs of the group.\n",
    "    # Note: `disambigute_duplicates.values()` holds lists of IDs, so membership has to be\n",
    "    #  tested against each pair, not against `.values()` itself.\n",
    "    keep = None\n",
    "    for key, pair in disambigute_duplicates.items():\n",
    "        if ids and all(doc_id in pair for doc_id in ids):\n",
    "            keep = key\n",
    "            break\n",
    "    if keep is not None:\n",
    "        # keep the doc named by the key, drop the other doc(s) of the pair\n",
    "        for doc_id in disambigute_duplicates[keep]:\n",
    "            if doc_id != keep:\n",
    "                acorp.remove_documents([doc_id])\n",
    "        # the dropped doc must not be looked up or merged below\n",
    "        continue\n",
    "    # Otherwise the first doc is the 'original'; go through all other docs of the group\n",
    "    for doc_id in ids[1:]:\n",
    "        # for each annotator of the duplicate (snapshot, as annotations get removed while looping)\n",
    "        for annotator in list(acorp.docs[acorp.doc_id2idx[doc_id]].annotators):\n",
    "            # whether the annotator is already in the 'original'\n",
    "            if annotator in acorp.docs[acorp.doc_id2idx[ids[0]]].annotators:\n",
    "                # if so, remove the duplicate's annotation\n",
    "                acorp.docs[acorp.doc_id2idx[doc_id]].remove_annotation(annotator)\n",
    "        if acorp.docs[acorp.doc_id2idx[doc_id]].n_annotations > 0:\n",
    "            # annotations by other annotators are left: merge the duplicate with the 'original'\n",
    "            acorp.merge_annotations([ids[0], doc_id])\n",
    "        else:\n",
    "            # nothing left that the 'original' does not already have\n",
    "            acorp.remove_documents([doc_id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1]), array([8576]))\n"
     ]
    }
   ],
   "source": [
    "# verify: tabulate how often each distinct text occurs in the corpus\n",
    "#  (prints the distinct occurrence counts and how many texts have each count)\n",
    "texts = Counter(doc.text for doc in acorp.docs)\n",
    "occurrences = np.asarray(list(texts.values()))\n",
    "print(np.unique(occurrences, return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rebuild the ID <=> position lookups and the label counts from the current `acorp.docs`\n",
    "acorp.doc_idx2id = dict(enumerate(doc.id for doc in acorp.docs))\n",
    "acorp.doc_id2idx = {doc_id: idx for idx, doc_id in acorp.doc_idx2id.items()}\n",
    "acorp.annotator_label_counts = acorp._count_annotator_labels()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clean tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vocabulary of distinct tokens, and how often each character occurs across all tokens\n",
    "toks, all_chars = set(), Counter()\n",
    "for doc in acorp.docs:\n",
    "    toks.update(doc.tokens)\n",
    "    for tok in doc.tokens:\n",
    "        # updating a Counter with a string counts its individual characters\n",
    "        all_chars.update(tok)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cc\tControl\t['\\x91', '\\x92']\n",
      "Co\tPrivate Use\t['\\uf02f']\n",
      "No\tOther Number\t['½']\n",
      "Pd\tDash Punctuation\t['-', '–']\n",
      "Pe\tClose Punctuation\t[')', ']']\n",
      "Pf\tFinal Punctuation\t['’', '”']\n",
      "Pi\tInitial Punctuation\t['‘', '“']\n",
      "Po\tOther Punctuation\t[',', '.', ':', '?', '%', ';', '&', '/', \"'\", '\"', '·']\n",
      "Ps\tOpen Punctuation\t['(']\n",
      "Sc\tCurrency Symbol\t['£', '€']\n",
      "Sk\tModifier Symbol\t['^']\n",
      "Sm\tMath Symbol\t['+', '¬', '<', '>']\n"
     ]
    }
   ],
   "source": [
    "from utils.unicode import CATEGORIES as unicode_cats\n",
    "\n",
    "# Skip lowercase letters, uppercase letters and decimal digits.\n",
    "# Filter into a new dict instead of `del`-ing keys: deleting would modify the dict object\n",
    "#  that lives in `utils.unicode`, and re-running this cell would then fail with a KeyError.\n",
    "skip_cats = ('Ll', 'Lu', 'Nd')\n",
    "char_cats = {cat: name for cat, name in unicode_cats.items() if cat not in skip_cats}\n",
    "\n",
    "# for each remaining category, list the characters seen in the corpus tokens\n",
    "for cat, name in char_cats.items():\n",
    "    in_cat = regex.compile(r'\\p{' + cat + '}')\n",
    "    found = [c for c in all_chars if in_cat.match(c)]\n",
    "    if found:\n",
    "        print(cat, name, found, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "c60c93194a1f4b2d3b8a725c1ea05734 ['We', 'will', 'reverse', 'Tory', 'policies', 'on', 'the', 'privatisation', 'of', 'local', 'authority', 'services', '.', '<', '/', 'p>']\n"
     ]
    }
   ],
   "source": [
    "# find the docs that contain a stand-alone '<' token\n",
    "for doc in acorp.docs:\n",
    "    if '<' not in doc.tokens:\n",
    "        continue\n",
    "    print(doc.id, doc.tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1mc60c93194a1f4b2d3b8a725c1ea05734\u001b[0m\n",
      "'We will reverse Tory policies on the privatisation of local authority services . < / p>'\n",
      "                 \u001b[44m    \u001b[49m                                  \u001b[44m     \u001b[49m \u001b[44m         \u001b[49m                  \t(emarie)\n",
      "                 \u001b[44m    \u001b[49m                                  \u001b[44m     \u001b[49m \u001b[44m         \u001b[49m \u001b[44m        \u001b[49m         \t(sjasmin)\n"
     ]
    }
   ],
   "source": [
    "# show the doc that contains the '<' token together with its annotations\n",
    "markup_doc_idx = acorp.doc_id2idx['c60c93194a1f4b2d3b8a725c1ea05734']\n",
    "print(acorp.docs[markup_doc_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop the left-over closing tag ('<', '/', 'p>') at the end of this doc,\n",
    "#  together with the matching positions in every annotation and label sequence\n",
    "doc = acorp.docs[acorp.doc_id2idx['c60c93194a1f4b2d3b8a725c1ea05734']]\n",
    "markup_tokens = ['<', '/', 'p>']\n",
    "n_drop = len(markup_tokens)\n",
    "# only cut while the tag is still there, so that re-running this cell does not\n",
    "#  chop off three more (legitimate) tokens each time\n",
    "if list(doc.tokens[-n_drop:]) == markup_tokens:\n",
    "    doc.tokens = doc.tokens[:-n_drop]\n",
    "    for annotator in doc.annotations.keys():\n",
    "        doc.annotations[annotator] = doc.annotations[annotator][:-n_drop]\n",
    "    for label in doc.labels.keys():\n",
    "        doc.labels[label] = doc.labels[label][:-n_drop]\n",
    "# NOTE(review): `doc.text` is not touched here; confirm whether it also needs the tag removed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# single characters to normalise => their replacement ('' deletes the character)\n",
    "replace_chars = {\n",
    "    # Cc\n",
    "    # NOTE(review): U+0091/U+0092 usually stem from Windows-1252 bytes 0x91/0x92, i.e. single\n",
    "    #  curly quotes; the mapping to a double quote is kept as it was, please confirm it is intended\n",
    "    '\\x91': '\"',\n",
    "    '\\x92': '\"',\n",
    "    # Co\n",
    "    '\\uf02f': '',\n",
    "    # No\n",
    "    '½': '1/2',\n",
    "    # Po\n",
    "    '·': '',\n",
    "    # Sk\n",
    "    '^': ' ',\n",
    "    # Sm\n",
    "    '¬': '-'\n",
    "}\n",
    "\n",
    "# All keys are single literal characters, so one translation table does the job in a single\n",
    "#  pass per string and needs no regex escaping (the former '\\^' key was an invalid string escape).\n",
    "char_table = str.maketrans(replace_chars)\n",
    "for doc in acorp.docs:\n",
    "    doc.text = doc.text.translate(char_table)\n",
    "    # NOTE(review): tokens consisting only of deleted characters become '' but stay in the list\n",
    "    doc.tokens = [tok.translate(char_table) for tok in doc.tokens]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Write to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the output folder if there is one (dirname is '' for a bare file name,\n",
    "#  and os.makedirs('') would raise)\n",
    "output_dir = os.path.dirname(args.output_file)\n",
    "if output_dir:\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# never replace an existing file unless explicitly asked to, and say so instead of skipping silently\n",
    "if args.overwrite_output or not os.path.exists(args.output_file):\n",
    "    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')\n",
    "else:\n",
    "    print(f\"'{args.output_file}' already exists, nothing written (set args.overwrite_output = True to replace it)\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "group_mention_detection",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
